# Import necessary libraries
# Standard library
import json
import pprint
import sqlite3
import warnings
from functools import partial

# Data handling
import numpy as np
import pandas as pd
from scipy.spatial.distance import cityblock
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from scipy.stats import uniform as sp_randfloat

# Visualisation
import calmap
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import plotly.express as px
import plotly.express as px
import seaborn as sns
import folium
from folium import plugins
from folium.plugins import DualMap
from folium.plugins import FastMarkerCluster
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime
from folium.plugins import MarkerCluster

# Geospatial
import geohash_to_geojson
import pygeohash as pgh
import pygeohash as pgh
import pygeohash as pgh
from polygon_geohasher.polygon_geohasher import geohashes_to_polygon

# Machine learning
import lightgbm
import lightgbm as lgb
from lightgbm import LGBMModel, LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Misc
from tqdm import tqdm
from IPython.display import HTML, display
%matplotlib inline
# Notebook styling: centre image output and inject a jQuery button that
# toggles visibility of all code cells (works in the classic notebook UI).
HTML('''
<style>
.output_png {
display: table-cell;
text-align: center;
vertical-align: middle;
}
</style>
<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>
''')
Jasper Kristian Pangan | MSDS 2021
Most of us have experienced the carmageddon along EDSA. Traffic is one of the biggest crises in our country, one that several administrations have failed to solve. One of the major causes of the heavy traffic in our country is the poor public transportation system [1]. With the constant breakdown of the country's main transit system [2] and the lack of options due to the insanely long queues and madness experienced when using public transportation, many Filipinos resort to alternatives. In particular, ride-sharing services have grown exponentially during the last decade, specifically Grab.
In 2018 alone, Grab users traveled a cumulative total of 920 million kilometers [3] with over 35,000 partner drivers. Grab reported having cut down travel time by 70%, but customers still demand better service [4]. Since Grab receives around 600,000 bookings per day [5], they experience an undersupply of vehicles to meet the passenger demand. Hence, it is a common frustration among Filipinos booking a Grab ride to experience the following: (1) waiting a long time to book a ride to no avail, (2) a costly ride, and (3) drivers cancelling the ride.
This study focuses on the non-allocation of Grab Taxi bookings. The data was downloaded from this site and covers the period from August 8, 2013 to December 20, 2013. This study aims to answer the question: How can we predict the status of a Grab Taxi booking?
Predicting the status of a grab booking will be beneficial to different stakeholders such as the Grab passengers, Grab partner drivers, Grab, and the local government units.
The top features were hour_18, day_of_week_Friday, and day_of_week_Sunday; these features were determined based on gain.

To be able to predict the status of a Grab booking, a total of 109,463 transactions from August 8 to December 20, 2013 was downloaded from Tableau Public. The general workflow for classification, as shown in Figure 1, involves the following steps:
Each step of the workflow will be discussed in the succeeding sections.
The dataset was downloaded and extracted from Tableau Public. Prior to filtering and data processing, the Grab dataset is composed of 197,188 rows and 21 columns. A row corresponds to a booking information for Grab Taxi.
# Load the raw Grab bookings export; lower-case headers for consistent access.
df_original = pd.read_csv('DataSeerGrabPrizeData-1.csv')
df_original.columns = df_original.columns.str.lower()

# Collapse the raw booking outcomes into a binary label: Completed and
# Cancelled both mean a driver was allocated; 'Unal' appears to be a
# truncated variant of 'Unallocated' (presumed — verify against raw data).
status_map = {
    'Cancelled': 'Allocated',
    'Completed': 'Allocated',
    'Unallocated': 'Unallocated',
    'Unal': 'Unallocated',
}
df_original['status'] = df_original['status'].replace(status_map)
Data preprocessing was implemented on the acquired data. Data preprocessing includes: filtering `city` equal to Metro Manila, keeping bookings with `status` equal to either Completed or Cancelled, and converting `date` from string to datetime for easier processing.
# Drop calculated columns in Tableau created by the Author
# Remove the first 10 columns (Tableau-calculated fields added by the author).
df = df_original.drop(df_original.columns[range(10)], axis=1)
# Rename columns: spaces -> underscores for attribute-style access.
df.columns = df.columns.str.replace(' ', '_')
# Update the datatype of `date` column from object (string) to datetime.
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y %I:%M:%S %p')
# Drop unnecessary columns.
df = df.drop(['platform', 'source'], axis=1)
# Consider only Metro Manila bookings.
df = df[df['city'] == 'Metro Manila'].copy()
df = df.drop(['city'], axis=1)
# Define a bounding box as defined from: https://gist.github.com/graydon/11198540
ph_lon = [120.8661, 121.3440]
ph_lat = [14.1672, 14.8439]
# Keep only rows whose pick-up AND drop-off fall inside the bounding box.
# BUG FIX: the upper-bound condition of the two drop-off filters previously
# re-tested pick_up_latitude / pick_up_longitude instead of the drop-off
# columns, so out-of-box drop-offs were not actually filtered out.
df = df[(df['pick_up_latitude'] >= ph_lat[0]) &
        (df['pick_up_latitude'] <= ph_lat[1])]
df = df[(df['drop_off_latitude'] >= ph_lat[0]) &
        (df['drop_off_latitude'] <= ph_lat[1])]
df = df[(df['pick_up_longitude'] >= ph_lon[0]) &
        (df['pick_up_longitude'] <= ph_lon[1])]
df = df[(df['drop_off_longitude'] >= ph_lon[0]) &
        (df['drop_off_longitude'] <= ph_lon[1])]
# Create date-derived columns (month, week number, weekday, day, hour).
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
# Series.dt.week was removed in pandas 2.0; isocalendar().week is the
# documented replacement (cast back to int for a plain integer column).
df['week'] = df['date'].dt.isocalendar().week.astype(int)
df['day_of_week'] = df['date'].dt.day_name()
df['hour'] = df['date'].dt.hour
# Weekend flag: 1 for Saturday/Sunday (weekday >= 5), 0 otherwise.
df['weekend'] = (df['date'].dt.weekday >= 5).astype(int)
To be able to boost the performance of the machine learning algorithm, features were extracted from the raw data. The following process and features are described below:
drop_off_city and pick_up_city via reverse geocodedrop_off_geohash and pick_up_geohash via geohashing with precision set to 5 (4.89km x 4.89km)trip_distance using Manhattan distance
def reverse_geocode(df, *args):
    """Reverse-geocode coordinate pairs from ``df`` into city names.

    args[0] / args[1] name the latitude / longitude columns to read.
    Returns a list of city names, one per row, via the offline
    ``reverse_geocode`` package.
    """
    # Imported lazily so the rest of the notebook runs without the package.
    import reverse_geocode
    cities = []
    for lat, lon in tqdm(zip(df[args[0]], df[args[1]])):
        # search() expects a sequence of coordinate tuples — the trailing
        # comma makes `query` a 1-tuple containing the (lat, lon) pair.
        query = (lat, lon),
        res = reverse_geocode.search(query)
        cities.append(res[0]['city'])
    return cities
# Reverse-geocode both trip endpoints into city-name columns
# (drop-off first, then pick-up — same order as before).
for endpoint in ('drop_off', 'pick_up'):
    df[f'{endpoint}_city'] = reverse_geocode(
        df, f'{endpoint}_latitude', f'{endpoint}_longitude')
# Correct cities based on city for Metro Manila, province otherwise.
# Reverse geocoding returns barangay/suburb names for some coordinates;
# map those to their Metro Manila city, or to the province for points
# that fall outside Metro Manila.
# NOTE(review): 'Cabuyao' is mapped to 'Cavite' but Cabuyao is in Laguna;
# 'Sampaloc' is also a Manila district — verify these mappings.
mapping = {'Calumpang':'Marikina',
           'Bagong Pagasa':'Quezon City',
           'Taytay':'Rizal',
           'Cainta':'Rizal',
           'Del Monte':'Quezon City',
           'Port Area':'Manila',
           "Sambayanihan People's Village":'Las Piñas',
           'Quiapo':'Manila',
           'Antipolo':'Rizal',
           'San Mateo':'Rizal',
           'Niugan':'Laguna',
           'Teresa':'Rizal',
           'Imus':'Cavite',
           'Tanza':'Cavite',
           'Meycauayan':'Bulacan',
           'Malanday':'Valenzuela',
           'Obando':'Bulacan',
           'Loma de Gato':'Bulacan',
           'Carmona':'Cavite',
           'San Jose del Monte':'Bulacan',
           'Angono':'Rizal',
           'Plaridel':'Rizal',
           'Bacoor':'Cavite',
           'Cavite City':'Cavite',
           'Rodriguez':'Rizal',
           'Pinugay':'Rizal',
           'Cardona':'Rizal',
           'Marilao':'Bulacan',
           'Balagtas':'Bulacan',
           'General Trias':'Cavite',
           'Bocaue':'Bulacan',
           'Binangonan':'Rizal',
           'Kawit':'Cavite',
           'Biñan':'Laguna',
           'Santa Rosa':'Laguna',
           'San Pedro':'Laguna',
           'Cabuyao':'Cavite',
           'Dasmariñas':'Cavite',
           'Aplaya':'Laguna',
           'Guiguinto':'Bulacan',
           'Guyong':'Bulacan',
           'Kanluran':'Cavite',
           'Magsaysay':'Laguna',
           'Noveleta':'Cavite',
           'Pulong Santa Cruz':'Laguna',
           'Sampaloc':'Laguna',
           'Sulucan':'Cavite'
           }
# Apply the correction to both endpoint columns.
df['drop_off_city'] = df['drop_off_city'].replace(mapping)
df['pick_up_city'] = df['pick_up_city'].replace(mapping)
# Create tags (1/0) for drop off and pick up if within Metro Manila.
mm_cities = (['Pateros', 'San Juan', 'Pasay', 'Marikina',
              'Makati City', 'Pasig City', 'Quezon City',
              'Mandaluyong City', 'Manila', 'Las Piñas',
              'Caloocan City', 'Taguig', 'Valenzuela',
              'Muntinlupa', 'Malabon', 'Navotas'])
# Vectorised membership test instead of a per-row lambda; same 1/0 values.
df['drop_off_within_mm'] = df['drop_off_city'].isin(mm_cities).astype(int)
df['pick_up_within_mm'] = df['pick_up_city'].isin(mm_cities).astype(int)
def geohash(df, *args):
    """Encode (lat, lon) pairs from two DataFrame columns as geohash strings.

    args[0] / args[1] name the latitude / longitude columns; precision 5
    yields cells of roughly 4.89 km x 4.89 km.
    """
    lat_col, lon_col = args[0], args[1]
    return [pgh.encode(lat, lon, precision=5)
            for lat, lon in tqdm(zip(df[lat_col], df[lon_col]))]
# Geohash (precision 5) for each trip endpoint.
df['drop_off_geohash'] = geohash(df,'drop_off_latitude','drop_off_longitude')
df['pick_up_geohash'] = geohash(df,'pick_up_latitude','pick_up_longitude')
# Drop rows landing in cell 'wdw1t' — presumably an outlier cell;
# TODO confirm the rationale for excluding this specific geohash.
df = df[df['drop_off_geohash']!='wdw1t']
def cityblock_distance(frame=None):
    """Return the Manhattan (cityblock) distance between pick-up and
    drop-off coordinates for every row.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame with pick_up_latitude/longitude and drop_off_latitude/longitude
        columns. Defaults to the module-level ``df`` (backward compatible
        with the original zero-argument call).

    Returns
    -------
    list of float
        One distance per row, in degrees (|dlat| + |dlon|).
    """
    if frame is None:
        frame = df  # original behaviour: operate on the global DataFrame
    pick = frame[['pick_up_latitude', 'pick_up_longitude']].to_numpy()
    drop = frame[['drop_off_latitude', 'drop_off_longitude']].to_numpy()
    # Vectorised |dx| + |dy| replaces the per-row Python loop over
    # scipy.spatial.distance.cityblock — identical values, O(n) in NumPy.
    return np.abs(pick - drop).sum(axis=1).tolist()
# Manhattan distance between pick-up and drop-off for every booking.
df['trip_distance'] = cityblock_distance()
# Snapshot for EDA and persist the pre-model dataset for later cells.
df_eda = df.copy()
df_eda.to_csv('df_premodel.csv', index=False)
| Column | Description |
|---|---|
date |
timestamp of booking |
day_of_week |
day of week of booking |
hour |
hour of booking |
weekend |
tag if booking is on weekend |
drop_off_latitude |
drop off latitude |
drop_off_longitude |
drop off longitude |
pick_up_latitude |
pick up latitude |
pick_up_longitude |
pick up longitude |
pick_up_city |
pick up city |
drop_off_city |
drop off city |
pick_up_geohash |
pick up based on precision = 5 |
drop_off_geohash |
drop off geohash based on precision = 5 |
fare |
estimated fare amount |
pick_up_distance |
estimated distance of driver to passenger |
trip_distance |
estimated trip distance using Manhattan distance |
status |
status of booking |
# Reload the persisted pre-model dataset for visualisation.
df_viz = pd.read_csv('df_premodel.csv')
Figure 3 shows that the dataset is somewhat balanced, with about $57\%$ of the data having status equal to Allocated and $43\%$ having status equal to Unallocated. Hence, it is important to consider other metrics aside from accuracy when evaluating the machine learning model.
# Share of bookings per status (Figure 3).
viz = df_viz.groupby('status')[['status']].agg('count').rename(columns={'status':'count'}).reset_index()
viz['total'] = df_viz.shape[0]
viz['percent'] = viz['count']/viz['total'] * 100
fig, ax = plt.subplots()
# hue on the row index with dodge=False gives one distinctly coloured bar
# per status class.
g = sns.barplot(x='status', y='percent', data=viz,
                order = ['Allocated', 'Unallocated'],
                hue= viz.index, dodge=False,
                palette = ['#009F35', '#7F7F7F']);
# ax.set_xlabel('Number of Bookings')
ax.set_ylabel('% of Total')
ax.set_xlabel('Ride Status')
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
g.legend_.remove()
plt.show();
Daily trend of booking status. As shown in Figure 4, on a daily basis, the percentage of bookings that were unallocated tends to be higher during weekdays, specifically on Fridays.
def plot_calmap(df):
    """Calendar heatmap of the daily percentage of unallocated bookings."""
    # 1 = Unallocated, 0 = Allocated, so the daily mean is the unallocated share.
    series = df['status'].replace({'Unallocated':1, 'Allocated':0})
    # NOTE(review): 'date' was written by to_csv from a datetime column and
    # may include a time component — confirm format='%Y-%m-%d' parses it.
    series.index = pd.to_datetime(df['date'], format='%Y-%m-%d')
    df_plot = series.resample('D').mean()*100
    fig,ax = calmap.calendarplot(df_plot, fillcolor='grey', cmap='YlOrRd',
                                 linewidth=0.25, fig_kws=dict(figsize=(14,8)))
    # Shared horizontal colorbar built from the first subplot's mesh.
    cbar = fig.colorbar(ax[0].get_children()[1], ax=ax.ravel().tolist(), fraction=0.045, pad=0.05, orientation='horizontal');
    cbar.ax.set_xticklabels(['10%','20%','30%','40%','50%','60%','70%' ]);
plot_calmap(df_viz)
In Figure 5, it can be observed that during the morning rush (7–9 AM) unallocated bookings start to peak, especially in the business districts of Makati, Ortigas, and BGC. Another increase is observed around 3 PM.
# Functions
def plot_heatmaptime(df, latitude, longitude):
    """Animated folium heatmap of booking coordinates, one frame per hour.

    latitude / longitude are the names of the coordinate columns to plot.
    Returns the folium Map object.
    """
    # The Mapbox tile URL/key is kept out of the notebook; read it from disk.
    with open('mapbox_key.txt') as f:
        mapbox = f.read().strip()
    a = []
    # Build one frame of [lat, lon] points per hour of the day.
    for group, data in df.groupby('hour'):
        a.append([[lat,lon] for lat, lon in zip(data[latitude],
                                                data[longitude])])
    # Frame labels: 'Time : H:00', sorted by hour.
    date = df['hour'].drop_duplicates().sort_values()
    date = 'Time : ' + date.astype(str) + ':00'
    # Centre on Metro Manila.
    main = folium.Map([14.5547, 121.0244], zoom_start=13)
    hm = HeatMapWithTime(a, index= date.to_list()).add_to(main)
    folium.TileLayer(tiles=mapbox, attr='Mapbox attribution').add_to(main)
    return main
plot_heatmaptime(df_viz[df_viz['status']=='Unallocated'], 'pick_up_latitude', 'pick_up_longitude')
def _plot_geohash_status_map(frame, geohash_col):
    """Render a Mapbox choropleth of the share of unallocated bookings per
    geohash cell.

    frame       : DataFrame with a 'status' column and `geohash_col`.
    geohash_col : 'pick_up_geohash' or 'drop_off_geohash'.

    Extracted from two near-identical copy-pasted cells (pick-up and
    drop-off versions) — behaviour is unchanged, code exists once.
    """
    tmp = frame.copy()
    tmp['status'] = tmp['status'].replace({'Unallocated': 1, 'Allocated': 0})
    viz = tmp.groupby(geohash_col)['status'].agg(['sum', 'count']).reset_index()
    viz['percent_cancelled'] = viz['sum'] / viz['count']
    # NOTE(review): 'polygon' is computed but not used by the figure below —
    # kept for parity with the original cells; confirm whether it is needed.
    viz['polygon'] = viz[geohash_col].apply(lambda x: geohashes_to_polygon(x))
    geojson = json.loads(
        geohash_to_geojson.geohash_to_geojson((viz[geohash_col].unique())))
    # you need your own token; `with` guarantees the handle is closed.
    with open("mapbox_token.txt") as fh:
        token = fh.read()
    fig = px.choropleth_mapbox(viz,
                               geojson=geojson,
                               locations=geohash_col,
                               featureidkey='properties.id',
                               color='percent_cancelled',
                               color_continuous_scale="YlOrRd",
                               center={"lat": 14.5547, "lon": 121.02},
                               opacity=0.75
                               )
    fig.update_layout(
        coloraxis_colorbar={"yanchor": "top", "y": 1,
                            "title": '% of Unallocated Bookings',
                            "tickvals": [0, 0.25, 0.5, 0.75, 1],
                            "ticktext": ['0%', '25%', '50%', '75%', '100%']},
        mapbox={
            'accesstoken': token,
            'style': "streets", 'zoom': 8},
        showlegend=False)
    fig.show(renderer='notebook')

# Same two maps as before: pick-up cells first, then drop-off cells.
_plot_geohash_status_map(df_viz, 'pick_up_geohash')
_plot_geohash_status_map(df_viz, 'drop_off_geohash')
To prepare the data for machine learning, the categorical variables were vectorized via one-hot encoding using pd.get_dummies(). Additionally, the target variable was encoded via LabelEncoder(). The dataset was split (stratified) into 2: train and test data with test_size = 0.20.
df_viz = pd.read_csv('df_premodel.csv')
# One-hot encode the categorical features and drop the raw/unused columns
# (coordinates, city names, fare, etc.) that are not fed to the model.
df_model = pd.get_dummies(df_viz,
                          columns=[
                              'pick_up_geohash',
                              'drop_off_geohash',
                              'month',
                              'hour',
                              'day_of_week'],
                          prefix=[
                              'pick_up_geohash',
                              'drop_off_geohash',
                              'month',
                              'hour',
                              'day_of_week']).drop(columns=['date',
                                                            'fare',
                                                            'weekend',
                                                            'pick_up_distance',
                                                            'drop_off_city',
                                                            'pick_up_city',
                                                            'drop_off_latitude',
                                                            'pick_up_latitude',
                                                            'pick_up_longitude',
                                                            'drop_off_longitude',
                                                            'day',
                                                            'week'])
# Encode the target ('Allocated'/'Unallocated') as integers.
le = LabelEncoder()
y = le.fit_transform(df_model['status'])
# Build the feature matrix once and reuse it — the original dropped
# 'status' twice, constructing an identical throwaway frame.
X = df_model.drop(columns = 'status')
feature_names = X.columns
# Stratified 80/20 split preserves the class balance in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.20,
                                                    random_state = 42,
                                                    stratify = y
                                                    )
# Baseline LightGBM: default settings except a depth cap of 10.
clf = LGBMClassifier(max_depth = 10)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
print("\nModel Report - Train Data")
print('\n', classification_report(y_train, y_train_pred))
print("\nModel Report - Test Data")
print('\n', classification_report(y_test, y_test_pred))
# Fit-time parameters: early stopping on the held-out set, AUC tracked.
fit_params={"early_stopping_rounds":30,
            "eval_metric" : 'auc',
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}
# Hyper-parameter search space (distributions for RandomizedSearchCV).
param_test ={'num_leaves': sp_randint(6, 50),
             'min_child_samples': sp_randint(100, 500),
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8),
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 10
# Base estimator for the search; metric='None' defers to eval_metric above.
clf = LGBMClassifier(max_depth=10,
                     random_state=42,
                     silent=True,
                     metric='None',
                     n_jobs=4,
                     n_estimators=1000)
# NOTE(review): `gs` is configured here but `gs.fit(...)` does not appear in
# this part of the file — confirm the search was actually executed elsewhere
# (the hard-coded `params` in the next cell suggest it was).
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring='accuracy',
    cv=3,
    refit=True,
    random_state=42,
    verbose=False)
# Best hyper-parameters found by the randomized search (hard-coded so the
# notebook is reproducible without re-running the search).
params = {'colsample_bytree': 0.9053708647569991,
          'min_child_samples': 143,
          'min_child_weight': 0.001,
          'num_leaves': 15,
          'reg_alpha': 2,
          'reg_lambda': 20,
          'subsample': 0.7818175966851368}
clf = LGBMClassifier(max_depth=10,
                     metric='None',
                     n_estimators=1000, n_jobs=4,
                     random_state=42)
clf.set_params(**params)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
print("\nModel Report - Train Data")
print('\n', classification_report(y_train, y_train_pred))
# RandomizedSearchCV-tuned model: test-set report.
# Fixed: the original printed "\Model ..." — "\M" is not an escape sequence,
# so a literal backslash-M was printed instead of a leading newline.
print("\nModel Report - Test Data")
print('\n', classification_report(y_test, y_test_pred))
# Top-10 features by gain for the RandomizedSearchCV-tuned LightGBM model.
# (Heading text fused into this line by the notebook export made it a
# syntax error; moved here as a comment.)
lightgbm.plot_importance(clf, max_num_features=10, importance_type='gain',
                         grid=False, figsize=(10, 7),
                         height=0.65,
                         color='#009F35');
# Defining Metric:
# Proportional chance criterion (PCC): the accuracy a random classifier
# achieves given the class proportions; 1.25 * PCC is the conventional
# threshold a model should beat.
counts = df_viz['status'].value_counts()
n_rows = df_viz.shape[0]
# .iloc makes the positional access explicit — integer [] on the string
# index of value_counts() relied on a pandas fallback that has been removed.
pcc = (counts.iloc[1] / n_rows)**2 + (counts.iloc[0] / n_rows)**2
1.25*pcc
CatBoostClassifier was also used for sensitivity analysis.
See documentation [here]
# CatBoost sensitivity check with a comparable depth cap.
clf = CatBoostClassifier(iterations=10,
                         eval_metric='AUC',
                         max_depth = 10,
                         learning_rate=0.1)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
print("\nModel Report - Train Data")
print('\n', classification_report(y_train, y_train_pred))
# CatBoostClassifier: test-set report.
# (Heading text fused into this line by the notebook export made it a
# syntax error; moved here as a comment.)
print("\nModel Report - Test Data")
print('\n', classification_report(y_test, y_test_pred))
CatBoostClassifier

Using the learnings from Machine Learning, I was able to expand the limited features available in the dataset through feature engineering. Considering the size of the dataset and the computing power of my laptop, I decided to use LightGBM, with which I obtained a $69\%$ accuracy (over the baseline of $63\%$). The top features identified were 6 PM (hour_18), Friday (day_of_week_Friday), and Sunday (day_of_week_Sunday). The findings from this study can help passengers, Grab and its partner drivers, and local government units.
For future studies, the following can be considered:
Data (e.g., refining the trip_distance estimate)
Methodology
Study